﻿/*
cd /projects/hsieh_project/proj_201809/code_1_data/
qstata --dofile=data_2_ind_sum_add.do --statatype=mp --cpucount=5 &

Add industry-level variables used in the analysis,
e.g. ln(employment share of top 10% firms)
*/

// Log when the run started
display "Started at $S_DATE $S_TIME"

// Revision stamp: the current date formatted as YYMMDD
global rev_date : display %tdYYNNDD date("$S_DATE", "DMY")
display "${rev_date}"

// Project root and data directory
global dir_proj "/projects/hsieh_project/proj_201809/"
global dir_data "${dir_proj}/data"

// Output dataset (path without extension)
global ds_out "${dir_data}/ind_sum_all_add"

// Year bounds; year1 is used below as the base year for market counts
global year1 1977
global year2 2013

// Percentile cutoffs to process and the geographic market definitions to loop over
global gl_perc "1 10 20 30 40 50 60 70 80 90 100"
global gl_vgeo "zipcode fips msa czone msa1983 msa1983cz"

//==============================================================================

// Load the input dataset ind_sum_all from the data directory
use "${dir_data}/ind_sum_all", clear

// Percentile cutoffs to process: take ${gl_perc}, falling back to "10" when it is empty
local l_perc "${gl_perc}"
if "`l_perc'" == "" {
	local l_perc "10"
}

/*
--------------------------------------------------------------------------------
Generate selected variables at industry level
*/

// ln(ind emp)
gen ln_emp_ind = ln(emp_ind)
// # of est per firm, in levels and in logs
gen estn_ind = est_ind / n_ind
gen ln_estn_ind = ln(estn_ind)

// Helper variables for deciles: the 100% cutoff equals the industry total and
// the 0% cutoff is zero, so decile differences can be formed at both ends
gen emp_ind_100 = emp_ind
gen emp_ind_0 = 0
gen pay_ind_100 = pay_ind
gen pay_ind_0 = 0

// Helper variables for each geographic market definition
foreach g of global gl_vgeo {
	// The global vgeo is still set on every pass, exactly as before
	global vgeo "`g'"

	// Total # of markets
	gen `g'_ind_c2 = ind_`g'_tot

	// Total # of markets in ${year1}, copied to every row of the same ch_ind.
	// Missing values sort last, so [1] picks the ${year1} value when one exists.
	local v_base ind_`g'_tot_${year1}
	gen `v_base' = ind_`g'_tot if year == ${year1}
	bysort ch_ind (`v_base'): replace `v_base' = `v_base'[1]

	// ln(average # of markets)
	gen ln_mkt_`g' = ln(mkt_`g')
}

/*
--------------------------------------------------------------------------------
Function that generates geo-related variables for top firms
*/

capture program drop geo_var
/*
geo_var <vi_perc>

Generates geo-related variables for the top-`vi_perc' firms, for the geography
named in the global ${geo}.

Argument:
  vi_perc   suffix identifying the percentile cutoff in variable names

Reads globals:
  ${geo}    geography name used inside variable names (must be set by the caller)
  ${year1}  base year used for the "total markets in base year" variable

Expects these variables to exist already: ch_ind, year, ind_${geo}_tot_<vi_perc>,
mkt_${geo}_<vi_perc>, ${geo}_ind_c2 and ln_mkt_${geo}.

Exits with r(198) and a message if the argument or either global is missing.
Side effect: the data are re-sorted by ch_ind.
*/
program geo_var
	args vi_perc
	
	// Fail early with a clear message instead of an obscure error further down
	if "`vi_perc'" == "" {
		di as error "geo_var: percentile suffix argument is required"
		exit 198
	}
	if "${geo}" == "" {
		di as error "geo_var: global geo must be set before calling geo_var"
		exit 198
	}
	if "${year1}" == "" {
		di as error "geo_var: global year1 must be set before calling geo_var"
		exit 198
	}
	
	// Total # of markets of top firms (=sum(total # of markets of each top firm))
	gen ${geo}_ind_c2_`vi_perc' = ind_${geo}_tot_`vi_perc'
	// Ratio of total markets of top firms over all firms
	gen ${geo}_ind_r2_`vi_perc' = ${geo}_ind_c2_`vi_perc' / ${geo}_ind_c2
	gen ln_${geo}_ind_r2_`vi_perc' = ln(${geo}_ind_r2_`vi_perc')
	
	// Total # of markets of top firms in ${year1}, copied to every row of the
	// same ch_ind (missing values sort last, so [1] is the ${year1} value if any)
	gen ind_${geo}_tot_${year1}_`vi_perc' = ind_${geo}_tot_`vi_perc' if year == ${year1}
	sort ch_ind ind_${geo}_tot_${year1}_`vi_perc'
	by ch_ind: replace ind_${geo}_tot_${year1}_`vi_perc' = ind_${geo}_tot_${year1}_`vi_perc'[1]
	
	// Average market size (recall that this is average of average)
	gen ln_mkt_${geo}_`vi_perc' = ln(mkt_${geo}_`vi_perc')
	// Log ratio of average market size of top firms over all firms
	gen ln_mkt_${geo}_r_`vi_perc' = ln_mkt_${geo}_`vi_perc' - ln_mkt_${geo}
	
end

/*
--------------------------------------------------------------------------------
Generates variables for top (and bottom) firms
*/

// Loop over the percentile cutoffs in `l_perc'. For each cutoff this builds the
// top-firm, bottom-firm, geo and decile variables, all suffixed with `vi_perc'.
foreach li_perc in `l_perc' {
	di "-----------------------------------------"
	di "`li_perc'%"
	// Name-safe form of the cutoff for use in variable names ("." -> "_").
	// Numeric comparisons and arithmetic below must use `li_perc', not `vi_perc'.
	local vi_perc = subinstr("`li_perc'", ".", "_", .)
	
	// Mark industries with insufficient number of firms:
	// n_ind < 100/cutoff means cutoff% of n_ind is less than one firm
	gen miss_`vi_perc' = 1 if n_ind < 100/`li_perc'
	
	// Top-vs-all and bottom-firm variables are skipped for the 100% cutoff
	if `li_perc' != 100 {
	
		/*
		-------------------------------------
		Employment share of top firms
		*/
		gen emps_ind_`vi_perc' = emp_ind_`vi_perc' / emp_ind if miss_`vi_perc' != 1
		gen ln_emps_ind_`vi_perc' = ln(emps_ind_`vi_perc')
	
		/*
		-------------------------------------
		Ratio of establishments of top firms vs all firms
		*/
		gen est_ind_r_`vi_perc' = est_ind_`vi_perc' / est_ind if miss_`vi_perc' != 1
		gen ln_est_ind_r_`vi_perc' = ln(est_ind_r_`vi_perc')
		
		/*
		-------------------------------------
		Average # of est per firm among top firms
		*/
		gen estn_ind_`vi_perc' = est_ind_`vi_perc' / n_ind_`vi_perc' if miss_`vi_perc' != 1
		gen ln_estn_ind_`vi_perc' = ln(estn_ind_`vi_perc')
		
		/*
		-------------------------------------
		Variables related to bottom (100 - `vi_perc') firms
		*/
		// Employment
		gen emp_ind_bot_`vi_perc' = emp_ind-emp_ind_`vi_perc' if miss_`vi_perc' != 1
		gen ln_emp_ind_bot_`vi_perc' = ln(emp_ind_bot_`vi_perc') if miss_`vi_perc' != 1
		
		// # of est
		gen est_ind_bot_`vi_perc' = est_ind-est_ind_`vi_perc' if miss_`vi_perc' != 1
		
		// # of firms
		gen n_ind_bot_`vi_perc' = n_ind-n_ind_`vi_perc' if miss_`vi_perc' != 1
		
		// Average # of est (missing wherever the inputs above are missing)
		gen estn_ind_bot_`vi_perc' = est_ind_bot_`vi_perc' / n_ind_bot_`vi_perc'
		gen ln_estn_ind_bot_`vi_perc' = ln(estn_ind_bot_`vi_perc')
	
		/*
		-------------------------------------
		Geo-related variables
		*/
		foreach vgeo in $gl_vgeo {
			// geo_var reads the geography from the global ${geo}
			global geo = "`vgeo'"
			geo_var `vi_perc'
		}
		
		/*
		-------------------------------------
		Variables related to intensive margins
		(i.e. employment per market, as a ratio of top vs all firms)
		*/
		gen ln_est_ind_rc_`vi_perc' = ln_emps_ind_`vi_perc' - ln_est_ind_r_`vi_perc'
		foreach vgeo in $gl_vgeo {
			gen ln_`vgeo'_ind_r2c_`vi_perc' = ln_emps_ind_`vi_perc' - ln_`vgeo'_ind_r2_`vi_perc'
		}
	}
	
	/*
	----------------------------------------------------------------------------
	Variables related to (`li_perc' - 10)%-`li_perc'% firms
	
	Employment share of decile firms: the cutoff's cumulative employment minus
	that of the cutoff 10 points lower. The lower cutoff's variable must already
	exist (emp_ind_0 is created as a helper earlier in the file).
	*/
	// Compare on the numeric cutoff; the name-safe suffix is not a valid number
	// for fractional cutoffs (e.g. "12_5").
	if `li_perc' >= 10 {
		local vi_prev = subinstr("`=`li_perc' - 10'", ".", "_", .)
		gen emp_ind_`vi_perc'd = emp_ind_`vi_perc' - emp_ind_`vi_prev'
		gen emps_ind_`vi_perc'd = emp_ind_`vi_perc'd / emp_ind if miss_`vi_perc' != 1
		gen ln_emps_ind_`vi_perc'd = ln(emp_ind_`vi_perc'd / emp_ind) if miss_`vi_perc' != 1
	}
}

// Save the augmented dataset. The path is quoted, consistent with the `use`
// statement above, so a directory containing spaces does not break the command.
save "${ds_out}", replace
//export excel $ds_out.xlsx, replace firstrow(var)

// Optional date-stamped copies (disabled)
//save ${ds_out}_${rev_date}, replace
//export excel ${ds_out}_${rev_date}.xlsx, replace firstrow(var)

di "Ended at $S_DATE $S_TIME"
// End of do file
